1.  A Quick Example

	import org.apache.spark._
	import org.apache.spark.streaming._
	import org.apache.spark.streaming.StreamingContext._

	// Streaming context on the shell's existing SparkContext, 15-second batch interval.
	val ssc = new StreamingContext(sc, Seconds(15))

	// Word count over lines arriving on the local netcat socket:
	// split each line into words, pair each word with 1, sum per word per batch.
	val wordCounts = ssc.socketTextStream("localhost", 8091)
		.flatMap(_.split(" "))
		.map((_, 1))
		.reduceByKey(_ + _)

	// Print the first few counts of every batch to the console.
	wordCounts.print()

	ssc.start()
	ssc.awaitTermination()
	
	Then in your SSH session, type in the following command:
	
	nc -lk 8091
	
	Then every line typed in the terminal running the netcat server will be counted and printed on screen every 15 seconds (the batch interval configured above)
	
2.  User-to-Item Collaborative Filter Recs (ALS)

	2.0 Run the following command to start spark shell:
		spark-shell --jars=/root/TrainingOnHDP/lib/spark-cassandra-connector_2.10-1.6.2.jar

	2.1 Get Reference Data for Enrichment
	
		// Reference data for enrichment: actor records, with "id" renamed to
		// "itemId" so it lines up with the ratings data, aliased as "items".
		val itemsDF = sqlContext.read
			.format("json")
			.load("/root/labs/datasets/json/actors.json")
			.withColumnRenamed("id", "itemId")
			.as("items")

	2.2 Get Live Ratings from Cassandra

		// Live ratings from the Cassandra table sparklabs.item_ratings,
		// with column names normalized to camelCase, aliased as "itemRatings".
		val cassandraConfig = Map("keyspace" -> "sparklabs", "table" -> "item_ratings")
		val itemRatingsDF = sqlContext.read
			.format("org.apache.spark.sql.cassandra")
			.options(cassandraConfig)
			.load()
			.select($"userid", $"itemid", $"rating", $"timestamp")
			.withColumnRenamed("userid", "userId")
			.withColumnRenamed("itemid", "itemId")
			.as("itemRatings")
  
	2.3 Train ALS Model With the training dataset
	
		import org.apache.spark.ml.recommendation.ALS;

		// ALS hyper-parameters.
		val rank = 5;                 // number of latent factors per user/item
		val maxIterations = 10;       // maximum ALS iterations
		// Renamed from "convergenceThreshold": this value is fed to setRegParam,
		// i.e. it is the L2 regularization strength — ALS has no convergence threshold.
		val regParam = 0.01;
		val implicitPrefs = false;    // explicit ratings, not implicit feedback
		val alpha = 1.0;              // confidence scaling — only used when implicitPrefs = true
		val nonnegative = true;       // constrain latent factors to be non-negative
		val als = new ALS().setRank(rank).setMaxIter(maxIterations).setRegParam(regParam).setImplicitPrefs(implicitPrefs).setAlpha(alpha).setNonnegative(nonnegative).setUserCol("userId").setItemCol("itemId").setRatingCol("rating").setPredictionCol("prediction");
	
	2.4 Train Model
	
		import org.apache.spark.ml.recommendation.ALSModel;
		// Fit the ALS estimator on the Cassandra-sourced ratings (userId, itemId, rating columns configured in 2.3).
		val model = als.fit(itemRatingsDF);
		
	2.5 Save model in spark parquet format
		// Persist the fitted model to this path for reuse in section 3.
		// (Dropped the needless `s` interpolator — the path string contains no interpolated values.)
		model.save("/root/labs/datasets/alsmodel.parquet");
		
	2.6 Run the following command to stop spark-shell
		sc.stop

3. Real Time Recommendation

	3.0 Run the following spark shell
		spark-shell --jars=/root/TrainingOnHDP/lib/spark-streaming-kafka_2.10-1.6.0.jar,/root/TrainingOnHDP/lib/kafka_2.10-0.8.2.1.jar,/root/TrainingOnHDP/lib/kafka-clients-0.8.2.1.jar,/root/TrainingOnHDP/lib/metrics-core-2.2.0.jar,/training/apps/spark/elasticsearch-spark_2.10-2.4.1.jar

	3.1 Open new SSH and run the following command to create Kafka topic
		/usr/hdp/2.6.3.0-235/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --delete --topic item_ratings
		/usr/hdp/2.6.3.0-235/kafka/bin/kafka-topics.sh --zookeeper localhost:2181 --create --topic item_ratings --partitions 1 --replication-factor 1

	3.2 Setup elasticsearch
	
		curl -XDELETE 'http://sandbox-hdp.hortonworks.com:9200/sparklabs'
		curl -XPUT 'http://sandbox-hdp.hortonworks.com:9200/sparklabs/' -d '{
			"settings": {
				"number_of_shards": 1,
				"number_of_replicas": 0
			}
		}'
	
	3.3 Start Spark Streaming 
	
		import org.apache.spark.streaming.kafka.KafkaUtils;
		import org.apache.spark.streaming.Seconds;
		import org.apache.spark.streaming.StreamingContext;
		import org.apache.spark.SparkContext;
		import org.apache.spark.sql.SQLContext;
		import org.apache.spark.SparkConf;
		import kafka.serializer.StringDecoder;
		import org.apache.spark.sql.SaveMode;
		import org.apache.spark.sql.Row;
		import org.apache.spark.rdd.RDD;
		import org.apache.spark.streaming.Time;
		import org.apache.spark.ml.recommendation.ALSModel;
		import org.elasticsearch.spark.sql._;

		// 15-second micro-batches on the shell's SparkContext.
		val ssc = new StreamingContext(sc, Seconds(15));
		val brokers = "sandbox-hdp.hortonworks.com:6667";
		val topics = Set("item_ratings");
		val kafkaParams = Map[String, String]("metadata.broker.list" -> brokers);
		// Direct (receiver-less) Kafka stream of (key, value) string pairs.
		val ratingsStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics);
		val esConfig = Map("pushdown" -> "true", "es.nodes" -> "sandbox-hdp.hortonworks.com", "es.port" -> "9200");

		// FIX: load the ALS model ONCE, outside foreachRDD. The original reloaded
		// the model from disk on every micro-batch, which is pure per-batch overhead;
		// the saved model is static for the lifetime of this job.
		val alsmodel = ALSModel.load("/root/labs/datasets/alsmodel.parquet");

		ratingsStream.foreachRDD {
			(message: RDD[(String, String)], batchTime: Time) => {
				message.cache();
				// Each Kafka value is "userId,itemId" (see step 3.4).
				val tokens = message.map(_._2.split(","));
				val ratings:RDD[(Int, Int)] = tokens.map(token => (token(0).toInt, token(1).toInt));
				val ratingsDF = ratings.toDF("userId", "itemId");
				// Score the (user, item) pairs with the pre-loaded model.
				val predictionDF = alsmodel.transform(ratingsDF);
				// Append predictions to the Elasticsearch index sparklabs/item_ratings.
				predictionDF.write.format("org.elasticsearch.spark.sql").mode(SaveMode.Append).options(esConfig).save("sparklabs/item_ratings");
				message.unpersist();
			}
		};
		ssc.start();
		ssc.awaitTermination();

	3.4 Publish the message into Kafka Topic
	
		/usr/hdp/2.6.3.0-235/kafka/bin/kafka-console-producer.sh --broker-list sandbox-hdp.hortonworks.com:6667 --topic item_ratings
		
		Input the following:
		
		5980,10010
	
	3.5 Connect to Elasticsearch
	
		3.5.1 Search per index:
		
		http://localhost:9200/sparklabs/_search
		
		3.5.2 List all indexes:
	
		http://localhost:9200/_cat/indices?v
		
		
	3.6 Connect to Kibana

		http://localhost:8744/
	
		3.6.1 Create Index Pattern 'sparklabs'
	
4. Build Real-Time Bidding Application

	4.1 Download the dataset from http://goo.gl/lwgoxw

	4.2 Unzip the file
	
		extract bid.20130311.txt and upload into /root/TrainingOnHDP/dataset/spark/bid at your sandbox
		
		extract city.en.txt and upload into /root/TrainingOnHDP/dataset/spark/city at your sandbox
		
	
	4.3 Sliding Window Function - Run the following command:
	
		spark-submit --class ca.training.bigdata.spark.streaming.bidding.SlidingWindow --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
	

	4.4 Joining Streaming 1 - Run the following command:
	
		spark-submit --class ca.training.bigdata.spark.streaming.bidding.JoiningStreaming1 --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

		
	4.5 Joining Streaming 2 - Run the following command:
	
		spark-submit --class ca.training.bigdata.spark.streaming.bidding.JoiningStreaming2 --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
		
		
	4.6 Using the Dataset - Run the following command:
	
		spark-submit --class ca.training.bigdata.spark.streaming.bidding.UsingDataset --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
		
	4.7 Spark SQL Over Memory Sink - Run the following command:
	
		spark-submit --class ca.training.bigdata.spark.streaming.bidding.SparkSQLOverMemory --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

		
	4.8 Save to File Sink - Run the following command:
	
		spark-submit --class ca.training.bigdata.spark.streaming.bidding.SaveToFileSink --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar


	4.9 Monitoring Streaming - Run the following command:
	
		spark-submit --class ca.training.bigdata.spark.streaming.bidding.MonitoringStreaming --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar


	4.10 TrackingProgress - Run the following command:
	
		spark-submit --class ca.training.bigdata.spark.streaming.bidding.TrackingProgress --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
		
	
	4.11 KafkaIntegration - Run the following command:
	
		4.11.1 Create Kafka Topic
		
			/usr/hdp/2.6.3.0-235/kafka/bin/kafka-topics.sh --create --zookeeper localhost:2181 --replication-factor 1 --partitions 1 --topic class12Topic
		
		4.11.2 Start Spark Streaming
		
			spark-submit --class ca.training.bigdata.spark.streaming.kafka.KafkaIntegration --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/uber-StreamingApplicationOnSpark-1.0-SNAPSHOT.jar

		4.11.3 Open a new terminal and input a sentence into the Kafka topic

			/usr/hdp/2.6.3.0-235/kafka/bin/kafka-console-producer.sh --broker-list sandbox-hdp.hortonworks.com:6667 --topic class12Topic
			
		
		4.11.4 Go back to the 4.11.2 console; you will see the messages printed out
		

	4.12 Custom Receiver 1 - Run the following command:

	
		4.12.1 Open New Terminal and Run the following command:

			nc -lk 9999

		4.12.2 Start Spark Streaming
		
			spark-submit --class ca.training.bigdata.spark.streaming.receiver.CustomReceiver --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/uber-StreamingApplicationOnSpark-1.0-SNAPSHOT.jar localhost 9999
			
		4.12.3 Go back to 4.12.1
			
			Any words typed in this terminal will be counted and shown in the 4.12.2 Spark streaming console
			

	4.13 Custom Receiver 2 - Run the following command:
			
		spark-submit --class ca.training.bigdata.spark.streaming.receiver.HttpCustomReceiver --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/uber-StreamingApplicationOnSpark-1.0-SNAPSHOT.jar
	
		
5. Stateful and Transformation Example

	5.1 UpdateStateByKey Example

		spark-submit --class ca.training.bigdata.spark.streaming.UpdateStateByKeyExample --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
	
	5.2 MapWithState Example

		spark-submit --class ca.training.bigdata.spark.streaming.MapWithStateExample --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

	5.3 Transform Example

		spark-submit --class ca.training.bigdata.spark.streaming.TransformExample --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar

	5.4 TransformWith Example

		spark-submit --class ca.training.bigdata.spark.streaming.TransformWithExample --driver-memory 2G --executor-memory 2G --master local[2] /root/TrainingOnHDP/StreamingApplicationOnSpark/target/StreamingApplicationOnSpark-1.0-SNAPSHOT-jar-with-dependencies.jar
	
		